This notebook contains the scripts that evaluate the informedness of TCS compared to the cluster-based statistic at different sample sizes (supplementary analyses).
Loading required packages
import os
import numpy as np
import pandas as pd
import nibabel as nib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from tqdm.notebook import tqdm
Basic functions
def ensure_dir(file_name):
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    return file_name

def write_np(np_obj, file_path):
    with open(file_path, 'wb') as outfile:
        np.save(outfile, np_obj)

def load_np(file_path):
    with open(file_path, 'rb') as infile:
        return np.load(infile)
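As a quick illustration of these helpers, the round trip below writes and reloads a small array (the path is purely illustrative):
# Illustrative round trip (the /tmp path is hypothetical):
arr_path = ensure_dir('/tmp/demo/arr.npy')  # creates /tmp/demo if missing
write_np(np.arange(3), arr_path)
print(load_np(arr_path))  # [0 1 2]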
Plot settings (LaTeX is used for nicer text rendering)
sns.set()
sns.set_style("darkgrid")
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.rc('text', usetex=True)
plt.rc('text.latex', preamble=r'\usepackage{mathtools} \usepackage{sfmath}')
plt.rc('xtick', labelsize=20)
plt.rc('ytick', labelsize=20)
plt.rc('axes', labelsize=24)
plt.rc('figure', dpi=500)
The ground-truth maps computed in notebook 2 are loaded here.
# List of all tasks and the cope number of each selected contrast
tasks = {
    'EMOTION': '3',     # faces - shapes
    'GAMBLING': '6',    # reward - punish
    'RELATIONAL': '4',  # rel - match
    'SOCIAL': '6',      # tom - random
    'WM': '20',         # face - avg
}
# Load the ground-truth effect size (Cohen's d) maps computed in notebook 2
# (there: mean and std across subjects, followed by a parametric one-sample t-test)
ground_truth_effect = {}
# Base directory where the files are stored
base_dir = '/data/netapp01/work/sina/structural_clustering/PALM_revision_1'
for task in tqdm(tasks, desc="Tasks loop", leave=True):
    ground_truth_effect[task] = load_np(
        '{}/ground_truth/cohen_d_{}_cope{}.dscalar.npy'.format(base_dir, task, tasks[task]),
    )
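For context, a one-sample Cohen's d map of this kind can be computed from the full set of subject-level contrast maps roughly as sketched below (a minimal sketch assuming a subjects-by-brainordinates array; the actual computation lives in notebook 2):
# Minimal sketch (assumed input): subject_maps is an (n_subjects, Nv) array
# of contrast values. One-sample Cohen's d is the across-subject mean
# divided by the across-subject standard deviation.
def cohen_d_map(subject_maps):
    return subject_maps.mean(axis=0) / subject_maps.std(axis=0, ddof=1)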
The PALM results stored by notebook 1 are loaded here.
%%time
# Number of random repetitions
repetitions = 500
# Different sample sizes tested
sample_sizes = [10, 20, 40, 80, 160, 320]
# Different cluster-defining thresholds
cdts = [3.3, 2.8, 2.6, 2.0, 1.6]
# Number of brainordinates in a CIFTI file
Nv = 91282
# Base directory where the files are stored
base_dir = '/data/netapp01/work/sina/structural_clustering/PALM_revision_1'
# Store the loaded results in nested python dictionaries
loaded_maps = {}
loaded_maps['uncorrected_tstat'] = {}
loaded_maps['spatial_cluster_corrected_tstat'] = {}
loaded_maps['topological_cluster_corrected_tstat'] = {}
# Only the z=3.3 (p=0.001) CDT is used for the main analyses reported here
cdt = 3.3
for task in tqdm(tasks, desc="Tasks loop", leave=True):
    loaded_maps['uncorrected_tstat'][task] = {}
    loaded_maps['spatial_cluster_corrected_tstat'][task] = {}
    loaded_maps['topological_cluster_corrected_tstat'][task] = {}
    for sample_size in tqdm(sample_sizes, desc="Sample size loop", leave=False):
        loaded_maps['uncorrected_tstat'][task][f'N={sample_size}'] = load_np(
            f'{base_dir}/summary/uncorrected_tstat_{task}_{sample_size}_samples_{cdt}_CDT.npy',
        )
        loaded_maps['spatial_cluster_corrected_tstat'][task][f'N={sample_size}'] = load_np(
            f'{base_dir}/summary/spatial_cluster_corrected_tstat_{task}_{sample_size}_samples_{cdt}_CDT.npy',
        )
        loaded_maps['topological_cluster_corrected_tstat'][task][f'N={sample_size}'] = load_np(
            f'{base_dir}/summary/topological_cluster_corrected_tstat_{task}_{sample_size}_samples_{cdt}_CDT.npy',
        )
CPU times: user 222 ms, sys: 1min 4s, total: 1min 4s
Wall time: 5min 41s
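As a quick sanity check (the expected shape is an assumption based on the loop above: one row per repetition, one column per brainordinate):
# Hypothetical spot check: rows of NaNs mark repetitions without stored
# results; they are dropped in the analysis cell below.
arr = loaded_maps['uncorrected_tstat']['EMOTION']['N=40']
print(arr.shape)  # expected: (500, 91282)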
The script below generates the results of the informedness analysis reported in the manuscript. Informedness (bookmaker informedness, equivalent to Youden's J statistic) summarizes binary classification performance as TPR + TNR − 1 = TPR − FPR; for more background, see Powers (2011), "Evaluation: From Precision, Recall and F-Measure to ROC, Informedness, Markedness and Correlation".
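In code, informedness for a single threshold reduces to the difference between the true and false positive rates; a minimal self-contained sketch (not the vectorized loop used below, which operates over all repetitions at once):
import numpy as np

def informedness(predicted, truth):
    """Bookmaker informedness (Youden's J) for boolean prediction/truth masks."""
    tp = np.sum(predicted & truth)
    fn = np.sum(~predicted & truth)
    fp = np.sum(predicted & ~truth)
    tn = np.sum(~predicted & ~truth)
    return tp / (tp + fn) - fp / (fp + tn)  # TPR - FPR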
%%time
# Number of repetitions for every sample size (repeated subsampling)
repetitions = 500
Nv = 91282
logp_threshold = -np.log10(0.05)
# 50 evenly spaced ground-truth effect size thresholds between 0.01 and 0.2
ground_truth_effect_thresholds = np.linspace(0.01, 0.2, 50)
# Bookmaker informedness (BM), true positive rate, and false positive rate
BM = {}
TPR = {}
FPR = {}
for task in tqdm(tasks, desc="Tasks loop", leave=True):
    BM[task] = {}
    TPR[task] = {}
    FPR[task] = {}
    for sample_size in tqdm(sample_sizes, desc="Sample size loop", leave=False):
        BM[task][f'N={sample_size}'] = {}
        TPR[task][f'N={sample_size}'] = {}
        FPR[task][f'N={sample_size}'] = {}
        BM[task][f'N={sample_size}']['spatial'] = []
        BM[task][f'N={sample_size}']['topological'] = []
        TPR[task][f'N={sample_size}']['spatial'] = []
        TPR[task][f'N={sample_size}']['topological'] = []
        FPR[task][f'N={sample_size}']['spatial'] = []
        FPR[task][f'N={sample_size}']['topological'] = []
        # Drop repetitions whose maps contain NaNs
        t_stats = loaded_maps['uncorrected_tstat'][task][f'N={sample_size}']
        t_stats = t_stats[~np.isnan(t_stats).any(axis=1)]
        spatial_cluster_logps = loaded_maps['spatial_cluster_corrected_tstat'][task][f'N={sample_size}']
        spatial_cluster_logps = spatial_cluster_logps[~np.isnan(spatial_cluster_logps).any(axis=1)]
        # Statistical predictions of the spatial cluster statistic
        spatial_increased_activation = (spatial_cluster_logps > logp_threshold) & (t_stats > 0)
        spatial_decreased_activation = (spatial_cluster_logps > logp_threshold) & (t_stats < 0)
        spatial_no_change_in_activation = (spatial_cluster_logps < logp_threshold)
        topological_cluster_logps = loaded_maps['topological_cluster_corrected_tstat'][task][f'N={sample_size}']
        topological_cluster_logps = topological_cluster_logps[~np.isnan(topological_cluster_logps).any(axis=1)]
        # Statistical predictions of the topological cluster statistic (TCS)
        topological_increased_activation = (topological_cluster_logps > logp_threshold) & (t_stats > 0)
        topological_decreased_activation = (topological_cluster_logps > logp_threshold) & (t_stats < 0)
        topological_no_change_in_activation = (topological_cluster_logps < logp_threshold)
        for ground_truth_effect_threshold in tqdm(ground_truth_effect_thresholds, desc="Thresholds loop", leave=False):
            # Ground truth conditions
            true_increased_activation = ground_truth_effect[task] > ground_truth_effect_threshold
            true_decreased_activation = ground_truth_effect[task] < -ground_truth_effect_threshold
            true_no_change_in_activation = np.abs(ground_truth_effect[task]) < ground_truth_effect_threshold
            # Confusion matrix counts per repetition (spatial)
            spatial_true_positive_increased_activation = np.multiply(spatial_increased_activation, np.tile(true_increased_activation, (spatial_cluster_logps.shape[0], 1))).sum(1)
            spatial_true_positive_decreased_activation = np.multiply(spatial_decreased_activation, np.tile(true_decreased_activation, (spatial_cluster_logps.shape[0], 1))).sum(1)
            spatial_true_positive = spatial_true_positive_increased_activation + spatial_true_positive_decreased_activation
            spatial_true_negative = np.multiply(spatial_no_change_in_activation, np.tile(true_no_change_in_activation, (spatial_cluster_logps.shape[0], 1))).sum(1)
            spatial_false_negative_increased_activation = np.multiply(spatial_no_change_in_activation, np.tile(true_increased_activation, (spatial_cluster_logps.shape[0], 1))).sum(1)
            spatial_false_negative_decreased_activation = np.multiply(spatial_no_change_in_activation, np.tile(true_decreased_activation, (spatial_cluster_logps.shape[0], 1))).sum(1)
            spatial_false_negative = spatial_false_negative_increased_activation + spatial_false_negative_decreased_activation
            spatial_false_positive = Nv - (spatial_true_positive + spatial_true_negative + spatial_false_negative)
            spatial_true_positive_rate = np.divide(spatial_true_positive, (spatial_true_positive + spatial_false_negative))
            spatial_false_positive_rate = np.divide(spatial_false_positive, (spatial_false_positive + spatial_true_negative))
            # Informedness = TPR - FPR
            spatial_BM = spatial_true_positive_rate - spatial_false_positive_rate
            TPR[task][f'N={sample_size}']['spatial'].append(spatial_true_positive_rate)
            FPR[task][f'N={sample_size}']['spatial'].append(spatial_false_positive_rate)
            BM[task][f'N={sample_size}']['spatial'].append(spatial_BM)
            # Confusion matrix counts per repetition (topological)
            topological_true_positive_increased_activation = np.multiply(topological_increased_activation, np.tile(true_increased_activation, (topological_cluster_logps.shape[0], 1))).sum(1)
            topological_true_positive_decreased_activation = np.multiply(topological_decreased_activation, np.tile(true_decreased_activation, (topological_cluster_logps.shape[0], 1))).sum(1)
            topological_true_positive = topological_true_positive_increased_activation + topological_true_positive_decreased_activation
            topological_true_negative = np.multiply(topological_no_change_in_activation, np.tile(true_no_change_in_activation, (topological_cluster_logps.shape[0], 1))).sum(1)
            topological_false_negative_increased_activation = np.multiply(topological_no_change_in_activation, np.tile(true_increased_activation, (topological_cluster_logps.shape[0], 1))).sum(1)
            topological_false_negative_decreased_activation = np.multiply(topological_no_change_in_activation, np.tile(true_decreased_activation, (topological_cluster_logps.shape[0], 1))).sum(1)
            topological_false_negative = topological_false_negative_increased_activation + topological_false_negative_decreased_activation
            topological_false_positive = Nv - (topological_true_positive + topological_true_negative + topological_false_negative)
            topological_true_positive_rate = np.divide(topological_true_positive, (topological_true_positive + topological_false_negative))
            topological_false_positive_rate = np.divide(topological_false_positive, (topological_false_positive + topological_true_negative))
            topological_BM = topological_true_positive_rate - topological_false_positive_rate
            TPR[task][f'N={sample_size}']['topological'].append(topological_true_positive_rate)
            FPR[task][f'N={sample_size}']['topological'].append(topological_false_positive_rate)
            BM[task][f'N={sample_size}']['topological'].append(topological_BM)
        # Stack per-threshold results into (thresholds, repetitions) arrays
        TPR[task][f'N={sample_size}']['spatial'] = np.array(TPR[task][f'N={sample_size}']['spatial'])
        FPR[task][f'N={sample_size}']['spatial'] = np.array(FPR[task][f'N={sample_size}']['spatial'])
        BM[task][f'N={sample_size}']['spatial'] = np.array(BM[task][f'N={sample_size}']['spatial'])
        TPR[task][f'N={sample_size}']['topological'] = np.array(TPR[task][f'N={sample_size}']['topological'])
        FPR[task][f'N={sample_size}']['topological'] = np.array(FPR[task][f'N={sample_size}']['topological'])
        BM[task][f'N={sample_size}']['topological'] = np.array(BM[task][f'N={sample_size}']['topological'])
CPU times: user 7min 44s, sys: 13.3 s, total: 7min 57s
Wall time: 7min 57s
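Because informedness is defined as TPR − FPR, the stored arrays can be cross-checked directly (the task and sample size below are arbitrary picks):
# Consistency check (illustrative keys): informedness must equal TPR - FPR
# for every threshold and repetition.
assert np.allclose(
    BM['EMOTION']['N=40']['spatial'],
    TPR['EMOTION']['N=40']['spatial'] - FPR['EMOTION']['N=40']['spatial'],
    equal_nan=True,
)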
import scipy.stats as stats  # needed by the t-distribution helpers below

def get_cohen_d_from_bonferroni_corrected_p_value(p, sample_size, multiple_comparisons):
    # Only valid for a two-tailed one-sample t-test
    unc_p = p / multiple_comparisons
    t_stat = stats.t.isf(unc_p / 2, sample_size - 1)
    cohen_d = t_stat / np.sqrt(sample_size)
    return cohen_d

def get_bonferroni_corrected_p_value_from_cohen_d(d, sample_size, multiple_comparisons):
    t_stat = d * np.sqrt(sample_size)
    unc_p = stats.t.sf(np.abs(t_stat), sample_size - 1) * 2
    p = unc_p * multiple_comparisons
    return p

def get_uncorrected_p_value_from_cohen_d(d, sample_size, multiple_comparisons):
    # multiple_comparisons is accepted for signature symmetry but unused
    t_stat = d * np.sqrt(sample_size)
    unc_p = stats.t.sf(np.abs(t_stat), sample_size - 1) * 2
    return unc_p
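These conversions are used later to delimit the shaded regions of the figure: with N = 1000 and Bonferroni correction over all 91,282 brainordinates, the smallest Cohen's d detectable at p < 0.05 can be computed as below (the quoted numbers are approximate):
# Effect size bounds used later for the shaded regions
d_unc = get_cohen_d_from_bonferroni_corrected_p_value(0.05, 1000, 1)       # uncorrected
d_bonf = get_cohen_d_from_bonferroni_corrected_p_value(0.05, 1000, 91282)  # Bonferroni over Nv
print(d_unc, d_bonf)  # roughly 0.06 and 0.16, respectively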
def compute_normalized_partial_area_under_curve(fpr, tpr, lowest_fpr, highest_fpr):
    # Partial AUC over the FPR window, normalized by the window width
    fpr_limited = np.sort(fpr[(fpr > lowest_fpr) & (fpr < highest_fpr)])
    tpr_limited = np.sort(tpr[(fpr > lowest_fpr) & (fpr < highest_fpr)])
    return metrics.auc(fpr_limited, tpr_limited) / (fpr_limited.max() - fpr_limited.min())

def compute_mean_informedness(fpr, tpr, lowest_fpr, highest_fpr):
    # Average informedness (TPR - FPR) over the FPR window
    fpr_limited = np.sort(fpr[(fpr > lowest_fpr) & (fpr < highest_fpr)])
    tpr_limited = np.sort(tpr[(fpr > lowest_fpr) & (fpr < highest_fpr)])
    return metrics.auc(fpr_limited, tpr_limited - fpr_limited) / (fpr_limited.max() - fpr_limited.min())

def compute_normalized_concordant_partial_area_under_curve(fpr, tpr, lowest_fpr, highest_fpr):
    # Mean of the horizontal and vertical normalized partial AUCs
    fpr_limited = np.sort(fpr[(fpr > lowest_fpr) & (fpr < highest_fpr)])
    tpr_limited = np.sort(tpr[(fpr > lowest_fpr) & (fpr < highest_fpr)])
    return (
        (metrics.auc(fpr_limited, tpr_limited) / (fpr_limited.max() - fpr_limited.min()))
        + (metrics.auc(tpr_limited, 1 - fpr_limited) / (tpr_limited.max() - tpr_limited.min()))
    ) / 2
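A quick check of the partial-AUC helper on a synthetic curve (not study data):
# Synthetic ROC-like curve that dominates the diagonal
fpr_demo = np.linspace(0, 1, 101)
tpr_demo = np.sqrt(fpr_demo)
# Normalized partial AUC over an FPR window; lies between 0 and 1 here
print(compute_normalized_partial_area_under_curve(fpr_demo, tpr_demo, 0.05, 0.5))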
import scipy.stats as stats
from scipy.interpolate import CubicSpline
from scipy.interpolate import UnivariateSpline
from statsmodels.stats.power import TTestPower
from matplotlib.patches import Patch
%config InlineBackend.figure_format = 'retina'
plt.rc('figure', dpi=300)
analysis = TTestPower()
fig = plt.figure(figsize=(30, 12), constrained_layout=True)
gs = fig.add_gridspec(3, 5)
# fig.suptitle('Comparison of classification performance', fontsize=36, y=1.06)
sample_colors = np.array(sns.color_palette("rainbow", len(sample_sizes)))
logp_threshold = -np.log10(0.05)
for ci, task in enumerate(tasks):
    for ri, method in enumerate(['spatial', 'topological', 'difference']):
        ax = fig.add_subplot(gs[ri, ci])
        nauc = []
        for si, sample_size in enumerate(sample_sizes):
            if method == 'difference':
                # Informedness advantage of the topological over the spatial statistic
                tpr = TPR[task][f'N={sample_size}']['topological'].mean(1)
                fpr = FPR[task][f'N={sample_size}']['topological'].mean(1)
                bmt = BM[task][f'N={sample_size}']['topological']
                bms = BM[task][f'N={sample_size}']['spatial']
                bmdiff = bmt - bms
                xlim = (min(ground_truth_effect_thresholds), max(ground_truth_effect_thresholds))
                sns.lineplot(
                    x=ground_truth_effect_thresholds,
                    y=bmdiff.mean(1),
                    style=True,
                    color=np.append(sample_colors[si] / 1.2, 1),
                    legend=False,
                    linewidth=2,
                )
                # 95% confidence band across repetitions
                ax.fill_between(
                    ground_truth_effect_thresholds,
                    bmdiff.mean(1) - (stats.sem(bmdiff, axis=1) * 1.96),
                    bmdiff.mean(1) + (stats.sem(bmdiff, axis=1) * 1.96),
                    color=np.append(sample_colors[si], 0.3),
                )
                dtof = UnivariateSpline(ground_truth_effect_thresholds, fpr, s=0)
                dtot = UnivariateSpline(ground_truth_effect_thresholds, tpr, s=0)
                dtobmd = UnivariateSpline(ground_truth_effect_thresholds, bmdiff.mean(1), s=0)
                nauc.append(
                    compute_normalized_partial_area_under_curve(
                        ground_truth_effect_thresholds,
                        bmdiff.mean(1),
                        get_cohen_d_from_bonferroni_corrected_p_value(0.05, 1000, 1),
                        get_cohen_d_from_bonferroni_corrected_p_value(0.05, 1000, 91282),
                    )
                )
            else:
                tpr = TPR[task][f'N={sample_size}'][method].mean(1)
                fpr = FPR[task][f'N={sample_size}'][method].mean(1)
                bm = BM[task][f'N={sample_size}'][method]
                sns.lineplot(
                    x=ground_truth_effect_thresholds,
                    y=bm.mean(1),
                    style=True,
                    color=np.append(sample_colors[si] / 1.2, 1),
                    legend=False,
                    linewidth=2,
                )
                # 95% confidence band across repetitions
                ax.fill_between(
                    ground_truth_effect_thresholds,
                    bm.mean(1) - (stats.sem(bm, axis=1) * 1.96),
                    bm.mean(1) + (stats.sem(bm, axis=1) * 1.96),
                    color=np.append(sample_colors[si], 0.3),
                )
                xlim = (min(ground_truth_effect_thresholds), max(ground_truth_effect_thresholds))
                dtof = UnivariateSpline(ground_truth_effect_thresholds, fpr, s=0)
                dtot = UnivariateSpline(ground_truth_effect_thresholds, tpr, s=0)
                nauc.append(
                    compute_normalized_partial_area_under_curve(
                        ground_truth_effect_thresholds,
                        bm.mean(1),
                        get_cohen_d_from_bonferroni_corrected_p_value(0.05, 1000, 1),
                        get_cohen_d_from_bonferroni_corrected_p_value(0.05, 1000, 91282),
                    )
                )
                ax.set_ylim(-0.05, 1.05)
        leg = ax.legend(
            handles=[Patch(facecolor=np.append(sample_colors[si], 1),) for si, sample_size in enumerate(sample_sizes)],
            labels=[r'${:.1f}\%$'.format(100 * x) for x in nauc],
            loc='upper left',
            ncol=1,
            fontsize=18,
            title="$AUC$",
            title_fontsize=24,
            labelspacing=1.,
            handlelength=1.,
        )
        leg.set_zorder(30)
        for patch in leg.get_patches():
            patch.set_width(20)
            patch.set_height(20)
            patch.set_y(-4)
        xlabel = ''
        if ri == 2:
            xlabel = 'Effect size threshold'
        ax.set_xlabel(xlabel, fontsize=40)
        ax.set_xlim(0.0, 0.21)
        ylabel = ''
        if ci == 0:
            ylabel = '{}\nInformedness'.format(task)
        ax.set_ylabel(ylabel, fontsize=40)
        axylim = ax.get_ylim()
        # Shade effect sizes above the Bonferroni detection bound
        ax.vlines(
            x=float(get_cohen_d_from_bonferroni_corrected_p_value(0.05, 1000, 91282)),
            ymin=axylim[0],
            ymax=axylim[1],
            linestyles='dashed',
            colors=np.array([212, 24, 0]) / 255,
            zorder=21,
            linewidth=2,
        )
        ax.fill_between(
            [get_cohen_d_from_bonferroni_corrected_p_value(0.05, 1000, 91282), 0.21],
            [axylim[0], axylim[0]],
            [axylim[1], axylim[1]],
            color=[0.7, 0.7, 0.7, 0.6],
            zorder=20,
        )
        # Shade effect sizes below the uncorrected detection bound
        ax.vlines(
            x=float(get_cohen_d_from_bonferroni_corrected_p_value(0.05, 1000, 1)),
            ymin=axylim[0],
            ymax=axylim[1],
            linestyles='dashed',
            colors=np.array([212, 24, 0]) / 255,
            zorder=21,
            linewidth=2,
        )
        ax.fill_between(
            [0, get_cohen_d_from_bonferroni_corrected_p_value(0.05, 1000, 1)],
            [axylim[0], axylim[0]],
            [axylim[1], axylim[1]],
            color=[0.7, 0.7, 0.7, 0.6],
            zorder=20,
        )
        ax.set_ylim(axylim)
        ax.set_facecolor(np.array([234, 234, 242]) / 255)
        ax.grid(color=(0.99, 0.99, 0.99,), linewidth=3)
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.spines['bottom'].set_visible(False)
        ax.spines['left'].set_visible(False)
        ax.tick_params(axis='both', colors=(0.5, 0.5, 0.5), labelcolor=(0, 0, 0), direction='out')
fig.legend(
    handles=[Patch(facecolor=np.append(sample_colors[si], 1),) for si, sample_size in enumerate(sample_sizes)],
    labels=[f'N={sample_size}' for sample_size in sample_sizes],
    loc='lower center',
    bbox_to_anchor=(0.5, 1.0),
    ncol=len(sample_sizes), fontsize=40,
)
plt.show()